employee <- read_dta("EmployeeStatus.dta")
employee_char <- read_dta("EmployeeCharacteristics.dta")
quits <- read_dta("Quits.dta")
quits_d <- read_dta("QuitDate.dta")
attitude <- read_dta("Attitudes.dta")
performance_p <- read_dta("Performance_Panel.dta")
performance <- read_dta("Performance.dta")
# create a list for all 7 datasets
data_list <- list(
  employee = employee,
  employee_char = employee_char,
  quits = quits,
  quits_d = quits_d,
  attitude = attitude, 
  performance_p = performance_p,
  performance = performance
)
# loop through and show summary of all 7 datasets
for (dataset_name in names(data_list)) {
  cat(dataset_name, ":\n")
  print(summary(data_list[[dataset_name]]))
  cat("\n")
}
## employee :
##     personid       treatment     
##  Min.   : 3906   Min.   :0.0000  
##  1st Qu.:19470   1st Qu.:0.0000  
##  Median :31936   Median :1.0000  
##  Mean   :29620   Mean   :0.5261  
##  3rd Qu.:39748   3rd Qu.:1.0000  
##  Max.   :45442   Max.   :1.0000  
## 
## employee_char :
##     personid     prior_experience      age            tenure      
##  Min.   : 3906   Min.   :-99.00   Min.   :-99.0   Min.   :-99.00  
##  1st Qu.:19470   1st Qu.:  0.00   1st Qu.: 22.0   1st Qu.:  9.00  
##  Median :31936   Median :  6.00   Median : 24.0   Median : 23.00  
##  Mean   :29620   Mean   : 16.93   Mean   : 23.4   Mean   : 26.22  
##  3rd Qu.:39748   3rd Qu.: 26.00   3rd Qu.: 26.0   3rd Qu.: 44.00  
##  Max.   :45442   Max.   :180.00   Max.   : 35.0   Max.   : 96.00  
##     basewage        bonus          grosswage    costofcommute   
##  Min.   :1156   Min.   : 123.2   Min.   :1388   Min.   : 0.000  
##  1st Qu.:1450   1st Qu.: 620.7   1st Qu.:2403   1st Qu.: 4.000  
##  Median :1550   Median : 909.3   Median :2786   Median : 8.000  
##  Mean   :1551   Mean   :1060.1   Mean   :2975   Mean   : 8.103  
##  3rd Qu.:1617   3rd Qu.:1294.0   3rd Qu.:3485   3rd Qu.:10.000  
##  Max.   :2567   Max.   :3965.9   Max.   :6221   Max.   :55.000  
##      rental            male           married        high_school    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.2249   Mean   :0.4659   Mean   :0.2691   Mean   :0.8434  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
## 
## quits :
##     personid        quitjob     
##  Min.   : 3906   Min.   :0.000  
##  1st Qu.:19470   1st Qu.:0.000  
##  Median :31936   Median :0.000  
##  Mean   :29620   Mean   :0.249  
##  3rd Qu.:39748   3rd Qu.:0.000  
##  Max.   :45442   Max.   :1.000  
## 
## quits_d :
##     personid          year          month         stillworking   
##  Min.   : 3906   Min.   :2010   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.:19470   1st Qu.:2011   1st Qu.: 3.000   1st Qu.:1.0000  
##  Median :31936   Median :2011   Median : 5.000   Median :1.0000  
##  Mean   :29620   Mean   :2011   Mean   : 5.333   Mean   :0.8831  
##  3rd Qu.:39748   3rd Qu.:2011   3rd Qu.: 7.000   3rd Qu.:1.0000  
##  Max.   :45442   Max.   :2011   Max.   :12.000   Max.   :1.0000  
##       quit        
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.02767  
##  3rd Qu.:0.00000  
##  Max.   :1.00000  
## 
## attitude :
##     personid        surveyno      post      satisfaction      general      
##  Min.   : 3906   Min.   :1   Min.   :0.0   Min.   :1.000   Min.   : 32.00  
##  1st Qu.:18112   1st Qu.:2   1st Qu.:1.0   1st Qu.:4.000   1st Qu.: 64.50  
##  Median :31136   Median :3   Median :1.0   Median :5.000   Median : 73.00  
##  Mean   :29094   Mean   :3   Mean   :0.8   Mean   :4.696   Mean   : 72.78  
##  3rd Qu.:39512   3rd Qu.:4   3rd Qu.:1.0   3rd Qu.:6.000   3rd Qu.: 80.00  
##  Max.   :45442   Max.   :5   Max.   :1.0   Max.   :7.000   Max.   :100.00  
##       life      
##  Min.   : 2.00  
##  1st Qu.:16.00  
##  Median :22.00  
##  Mean   :21.46  
##  3rd Qu.:27.00  
##  Max.   :38.00  
## 
## performance_p :
##     personid          year          month             post      
##  Min.   : 3906   Min.   :2010   Min.   : 1.000   Min.   :0.000  
##  1st Qu.:19470   1st Qu.:2010   1st Qu.: 3.000   1st Qu.:0.000  
##  Median :31292   Median :2010   Median : 6.000   Median :0.000  
##  Mean   :29369   Mean   :2010   Mean   : 5.777   Mean   :0.425  
##  3rd Qu.:39512   3rd Qu.:2011   3rd Qu.: 8.000   3rd Qu.:1.000  
##  Max.   :45442   Max.   :2011   Max.   :12.000   Max.   :1.000  
##                                                                 
##  performance_score total_monthly_calls calls_per_hour  
##  Min.   :  49.69   Min.   :-999999     Min.   : 12.21  
##  1st Qu.:  73.42   1st Qu.:   1346     1st Qu.: 18.65  
##  Median :  79.37   Median :   1852     Median : 20.52  
##  Mean   :  79.80   Mean   :   1639     Mean   : 20.58  
##  3rd Qu.:  84.72   3rd Qu.:   2396     3rd Qu.: 22.31  
##  Max.   :1000.00   Max.   :   5035     Max.   :200.00  
##  NA's   :139                           NA's   :141     
## 
## performance :
##     personid          post     performance_score total_monthly_calls
##  Min.   : 3906   Min.   :0.0   Min.   : 49.69    Min.   :-109647    
##  1st Qu.:19470   1st Qu.:0.0   1st Qu.: 73.60    1st Qu.:   1398    
##  Median :31936   Median :0.5   Median : 78.74    Median :   1815    
##  Mean   :29620   Mean   :0.5   Mean   : 79.08    Mean   :   1615    
##  3rd Qu.:39748   3rd Qu.:1.0   3rd Qu.: 82.97    3rd Qu.:   2196    
##  Max.   :45442   Max.   :1.0   Max.   :283.19    Max.   :   4367    
##  calls_per_hour 
##  Min.   :13.88  
##  1st Qu.:18.92  
##  Median :20.53  
##  Mean   :20.60  
##  3rd Qu.:22.13  
##  Max.   :39.26
plot_scatter_first_column <- function(data, dataset_name) {
  numeric_vars <- sapply(data, is.numeric)
  numeric_names <- names(data)[numeric_vars]

  if (length(numeric_names) > 1) {
    x_axis <- numeric_names[1]  # use personid as the x-axis
    
    for (y_axis in numeric_names[-1]) {
      plot <- ggplot(data, aes_string(x = x_axis, y = y_axis)) +
        geom_point(alpha = 0.5) + 
        labs(title = paste("Scatter Plot of", y_axis, "vs", x_axis, "in", dataset_name),
             x = x_axis, y = y_axis) +
        theme_minimal()
      
      print(plot) 
    }
  } else {
    cat("No enough numeric variables.")
  }
}
# loop through each dataset and generate plots
for (dataset_name in names(data_list)) {
  cat("Generating plots for", dataset_name, "\n")
  plot_scatter_first_column(data_list[[dataset_name]], dataset_name)
}
## Generating plots for employee
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Generating plots for employee_char

## Generating plots for quits

## Generating plots for quits_d

## Generating plots for attitude

## Generating plots for performance_p

## Warning: Removed 139 rows containing missing values (`geom_point()`).

## Warning: Removed 141 rows containing missing values (`geom_point()`).

## Generating plots for performance

# dataset2. employee_char
data_list[[2]] <- data_list[[2]] %>%
  mutate(
    prior_experience = if_else(prior_experience < 0, NA, prior_experience),
    age = if_else(age < 0, NA, age),
    tenure = if_else(tenure < 0, NA, tenure)
  )
summary(data_list[[2]]) # double check the summary
##     personid     prior_experience      age            tenure     
##  Min.   : 3906   Min.   :  0.00   Min.   :18.00   Min.   : 2.00  
##  1st Qu.:19470   1st Qu.:  0.00   1st Qu.:22.00   1st Qu.: 9.00  
##  Median :31936   Median :  6.00   Median :24.00   Median :23.00  
##  Mean   :29620   Mean   : 17.87   Mean   :24.39   Mean   :27.23  
##  3rd Qu.:39748   3rd Qu.: 28.00   3rd Qu.:26.50   3rd Qu.:44.00  
##  Max.   :45442   Max.   :180.00   Max.   :35.00   Max.   :96.00  
##                  NA's   :2        NA's   :2       NA's   :2      
##     basewage        bonus          grosswage    costofcommute   
##  Min.   :1156   Min.   : 123.2   Min.   :1388   Min.   : 0.000  
##  1st Qu.:1450   1st Qu.: 620.7   1st Qu.:2403   1st Qu.: 4.000  
##  Median :1550   Median : 909.3   Median :2786   Median : 8.000  
##  Mean   :1551   Mean   :1060.1   Mean   :2975   Mean   : 8.103  
##  3rd Qu.:1617   3rd Qu.:1294.0   3rd Qu.:3485   3rd Qu.:10.000  
##  Max.   :2567   Max.   :3965.9   Max.   :6221   Max.   :55.000  
##                                                                 
##      rental            male           married        high_school    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.2249   Mean   :0.4659   Mean   :0.2691   Mean   :0.8434  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
## 
# dataset6. performance_p
data_list[[6]] <- data_list[[6]] %>%
  mutate(
    performance_score = ifelse(performance_score > 100, NA, performance_score),
    total_monthly_calls = ifelse(total_monthly_calls < 0, NA, total_monthly_calls),
    calls_per_hour = ifelse(calls_per_hour > 100, NA, calls_per_hour)
  )
summary(data_list[[6]])
##     personid          year          month             post      
##  Min.   : 3906   Min.   :2010   Min.   : 1.000   Min.   :0.000  
##  1st Qu.:19470   1st Qu.:2010   1st Qu.: 3.000   1st Qu.:0.000  
##  Median :31292   Median :2010   Median : 6.000   Median :0.000  
##  Mean   :29369   Mean   :2010   Mean   : 5.777   Mean   :0.425  
##  3rd Qu.:39512   3rd Qu.:2011   3rd Qu.: 8.000   3rd Qu.:1.000  
##  Max.   :45442   Max.   :2011   Max.   :12.000   Max.   :1.000  
##                                                                 
##  performance_score total_monthly_calls calls_per_hour 
##  Min.   : 49.69    Min.   :   0        Min.   :12.21  
##  1st Qu.: 73.41    1st Qu.:1346        1st Qu.:18.65  
##  Median : 79.37    Median :1852        Median :20.52  
##  Mean   : 79.17    Mean   :1860        Mean   :20.54  
##  3rd Qu.: 84.70    3rd Qu.:2396        3rd Qu.:22.31  
##  Max.   :100.00    Max.   :5035        Max.   :28.80  
##  NA's   :142       NA's   :1           NA's   :142
# dataset7. performance
data_list[[7]] <- data_list[[7]] %>%
  mutate(
    performance_score = if_else(performance_score > 100, NA, performance_score),
    total_monthly_calls = if_else(total_monthly_calls < 0, NA, total_monthly_calls),
    calls_per_hour = if_else(calls_per_hour > 35, NA, calls_per_hour)
  )
summary(data_list[[7]])
##     personid          post     performance_score total_monthly_calls
##  Min.   : 3906   Min.   :0.0   Min.   : 49.69    Min.   : 228.8     
##  1st Qu.:19470   1st Qu.:0.0   1st Qu.: 73.55    1st Qu.:1397.8     
##  Median :31936   Median :0.5   Median : 78.72    Median :1816.0     
##  Mean   :29620   Mean   :0.5   Mean   : 78.47    Mean   :1838.6     
##  3rd Qu.:39748   3rd Qu.:1.0   3rd Qu.: 82.93    3rd Qu.:2196.6     
##  Max.   :45442   Max.   :1.0   Max.   :100.00    Max.   :4367.1     
##                                NA's   :2         NA's   :1          
##  calls_per_hour 
##  Min.   :13.88  
##  1st Qu.:18.92  
##  Median :20.52  
##  Mean   :20.57  
##  3rd Qu.:22.13  
##  Max.   :28.80  
##  NA's   :1
# merge two datasets, employee and attitude
satis_lev <- inner_join(data_list[[1]], data_list[[5]], by = "personid")
sorted_satis_level <- arrange(satis_lev, personid)
head(sorted_satis_level) # preview the data
## # A tibble: 6 × 7
##   personid treatment surveyno  post satisfaction general  life
##      <dbl>     <dbl>    <dbl> <dbl>        <dbl>   <dbl> <dbl>
## 1     3906         1        1     0            7      78    18
## 2     3906         1        2     1            3      72    17
## 3     3906         1        3     1            5      65    15
## 4     3906         1        4     1            2      72    19
## 5     3906         1        5     1            6      73    20
## 6     4122         1        1     0            4      72    10
# create dummy variables and interaction terms
sorted_satis_level1 <- sorted_satis_level %>%
  mutate(time1 = as.numeric(surveyno == 1),
         time2 = as.numeric(surveyno == 2),
         time3 = as.numeric(surveyno == 3),
         time4 = as.numeric(surveyno == 4),
         time5 = as.numeric(surveyno == 5)) %>%
  mutate(time1_treat = time1 * treatment,
         time2_treat = time2 * treatment,
         time3_treat = time3 * treatment,
         time4_treat = time4 * treatment,
         time5_treat = time5 * treatment)
# run the OLS regression
model1 <- lm(satisfaction ~ time1_treat + time2_treat + time3_treat + time4_treat + time5_treat, data = sorted_satis_level1)
summary(model1)
## 
## Call:
## lm(formula = satisfaction ~ time1_treat + time2_treat + time3_treat + 
##     time4_treat + time5_treat, data = sorted_satis_level1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0841 -0.6262  0.1121  0.9159  2.6125 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.38750    0.07437  58.999  < 2e-16 ***
## time1_treat  0.23867    0.14856   1.607 0.108520    
## time2_treat  0.69661    0.14856   4.689 3.19e-06 ***
## time3_treat  0.47231    0.14856   3.179 0.001530 ** 
## time4_treat  0.50035    0.14856   3.368 0.000791 ***
## time5_treat  0.55643    0.14856   3.746 0.000192 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.33 on 849 degrees of freedom
## Multiple R-squared:  0.0387, Adjusted R-squared:  0.03304 
## F-statistic: 6.835 on 5 and 849 DF,  p-value: 2.906e-06
# merge three datasets, employee, employee_char and performance
performance <- data_list[[1]] %>%
  left_join(data_list[[2]], by = "personid") %>%
  left_join(data_list[[7]], by = "personid") %>%
  arrange(performance)
head(performance) # preview the data
## # A tibble: 6 × 17
##   personid treatment prior_experience   age tenure basewage bonus grosswage
##      <dbl>     <dbl>            <dbl> <dbl>  <dbl>    <dbl> <dbl>     <dbl>
## 1    33278         0                0    20     22     1450  594.     2275.
## 2    33278         0                0    20     22     1450  594.     2275.
## 3    44800         0                5    25      2     1500  588.     2742.
## 4    44800         0                5    25      2     1500  588.     2742.
## 5    32320         0                0    22     23     1600 2272      4578.
## 6    32320         0                0    22     23     1600 2272      4578.
## # ℹ 9 more variables: costofcommute <dbl>, rental <dbl+lbl>, male <dbl>,
## #   married <dbl>, high_school <dbl>, post <dbl>, performance_score <dbl>,
## #   total_monthly_calls <dbl>, calls_per_hour <dbl>
# create an interaction variable
performance$post_treatment <- performance$post * performance$treatment

# run OLS regression
model2 <- lm(performance_score ~ prior_experience + post + age + tenure + bonus + post_treatment, data = performance)
summary(model2)
## 
## Call:
## lm(formula = performance_score ~ prior_experience + post + age + 
##     tenure + bonus + post_treatment, data = performance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.9691  -3.8610   0.6565   4.0983  17.2268 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      72.0460028  2.4961700  28.863  < 2e-16 ***
## prior_experience -0.0215897  0.0131627  -1.640    0.102    
## post             -3.4108912  0.6524694  -5.228 2.56e-07 ***
## age               0.1198674  0.1187499   1.009    0.313    
## tenure           -0.0272765  0.0173378  -1.573    0.116    
## bonus             0.0051449  0.0005008  10.274  < 2e-16 ***
## post_treatment    3.6218727  0.7444218   4.865 1.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.811 on 485 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.2513, Adjusted R-squared:  0.242 
## F-statistic: 27.13 on 6 and 485 DF,  p-value: < 2.2e-16
# merge three datasets, employee, employee_char and quits
retention <- data_list[[1]] %>%
  left_join(data_list[[2]], by = "personid") %>%
  left_join(data_list[[3]], by = "personid")

# Sort by personid in ascending order
sorted_retention <- retention %>%
  arrange(personid)
head(sorted_retention) # preview the data
## # A tibble: 6 × 14
##   personid treatment prior_experience   age tenure basewage bonus grosswage
##      <dbl>     <dbl>            <dbl> <dbl>  <dbl>    <dbl> <dbl>     <dbl>
## 1     3906         1               84    33     96    1883.  713.     2738.
## 2     4122         1                0    30     94    1700  2097.     4096.
## 3     4448         0                0    35     92    1750  2153.     4269.
## 4     4942         1                0    27     82    1900  3043      5295.
## 5     5018         1                0    29     82    2139.  464.     1388.
## 6     5614         1                0    32     79    1700  1274.     3826.
## # ℹ 6 more variables: costofcommute <dbl>, rental <dbl+lbl>, male <dbl>,
## #   married <dbl>, high_school <dbl>, quitjob <dbl>
# conduct a logistic regression
model3 <- glm(quitjob ~  treatment + prior_experience + age + tenure + basewage + bonus + grosswage + costofcommute + rental + male + married + high_school, data = sorted_retention, family = binomial)
summary(model3)
## 
## Call:
## glm(formula = quitjob ~ treatment + prior_experience + age + 
##     tenure + basewage + bonus + grosswage + costofcommute + rental + 
##     male + married + high_school, family = binomial, data = sorted_retention)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.7319252  2.5681927   0.285 0.775647    
## treatment        -1.1453022  0.3384103  -3.384 0.000713 ***
## prior_experience  0.0110571  0.0088475   1.250 0.211395    
## age              -0.0939244  0.0894540  -1.050 0.293730    
## tenure           -0.0167165  0.0147146  -1.136 0.255934    
## basewage          0.0020688  0.0017527   1.180 0.237862    
## bonus             0.0009131  0.0009373   0.974 0.329940    
## grosswage        -0.0015550  0.0007494  -2.075 0.037974 *  
## costofcommute     0.0388097  0.0234825   1.653 0.098391 .  
## rental            0.6882141  0.4272095   1.611 0.107190    
## male              0.3721771  0.3657949   1.017 0.308941    
## married           0.2564367  0.4604334   0.557 0.577564    
## high_school       0.8911633  0.5440506   1.638 0.101418    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 278.34  on 246  degrees of freedom
## Residual deviance: 234.27  on 234  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 260.27
## 
## Number of Fisher Scoring iterations: 5